#Basic python library which need to import
import pandas as pd
import numpy as np
#Date stuff
from datetime import datetime
from datetime import timedelta
#Library for Nice graphing
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as sn
%matplotlib inline
#Library for statistics operation
import scipy.stats as stats
# Date Time library
from datetime import datetime
#Machine learning Library
import statsmodels.api as sm
from sklearn import metrics
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection. Fall back to the
# legacy module only for very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error, mean_squared_error
# Ignore warnings
import sys
import warnings
warnings.filterwarnings('ignore')
# Settings
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Print arrays in full with 3 decimals. The threshold must be a real number:
# np.nan is rejected (ValueError) by recent NumPy releases, and sys.maxsize is
# the documented way to ask for an untruncated representation.
np.set_printoptions(threshold=sys.maxsize, precision=3)
# Plot styling: seaborn dark-grid theme plus larger axis and tick label fonts.
sns.set(style="darkgrid")
plt.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20. "This module will be removed in 0.20.", DeprecationWarning) C:\ProgramData\Anaconda3\lib\site-packages\sklearn\ensemble\weight_boosting.py:29: DeprecationWarning: numpy.core.umath_tests is an internal NumPy module and should not be imported. It will be removed in a future NumPy release. from numpy.core.umath_tests import inner1d
# Load the four raw inputs from the working directory, in this order:
# training rows, test rows, store attributes and the extra features table.
train, test, stores, feature = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "stores.csv", "features.csv")
)
- Merged training set: train + stores + feature
- Merged test set: test + stores + feature
# Training set: attach the store attributes first, then the features table.
# Both joins use the pandas defaults (inner join on all shared column names).
train_bt = train.merge(stores)
train = train_bt.merge(feature)
# Test set: same two-step enrichment.
test_bt = test.merge(stores)
test = test_bt.merge(feature)
# Preview the first two merged training rows.
train.head(2)
Store | Dept | Date | Weekly_Sales | IsHoliday | Type | Size | Temperature | Fuel_Price | MarkDown1 | MarkDown2 | MarkDown3 | MarkDown4 | MarkDown5 | CPI | Unemployment | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 2010-02-05 | 24924.50 | False | A | 151315 | 42.31 | 2.572 | NaN | NaN | NaN | NaN | NaN | 211.096358 | 8.106 |
1 | 1 | 2 | 2010-02-05 | 50605.27 | False | A | 151315 | 42.31 | 2.572 | NaN | NaN | NaN | NaN | NaN | 211.096358 | 8.106 |
# Preview the first two rows of the merged test frame.
test.head(2)
Store | Dept | Date | IsHoliday | Type | Size | Temperature | Fuel_Price | MarkDown1 | MarkDown2 | MarkDown3 | MarkDown4 | MarkDown5 | CPI | Unemployment | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 2012-11-02 | False | A | 151315 | 55.32 | 3.386 | 6766.44 | 5147.7 | 50.82 | 3639.9 | 2737.42 | 223.462779 | 6.573 |
1 | 1 | 2 | 2012-11-02 | False | A | 151315 | 55.32 | 3.386 | 6766.44 | 5147.7 | 50.82 | 3639.9 | 2737.42 | 223.462779 | 6.573 |
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line after each report.
train.info()
print("*****************************************")
test.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 421570 entries, 0 to 421569 Data columns (total 16 columns): Store 421570 non-null int64 Dept 421570 non-null int64 Date 421570 non-null object Weekly_Sales 421570 non-null float64 IsHoliday 421570 non-null bool Type 421570 non-null object Size 421570 non-null int64 Temperature 421570 non-null float64 Fuel_Price 421570 non-null float64 MarkDown1 150681 non-null float64 MarkDown2 111248 non-null float64 MarkDown3 137091 non-null float64 MarkDown4 134967 non-null float64 MarkDown5 151432 non-null float64 CPI 421570 non-null float64 Unemployment 421570 non-null float64 dtypes: bool(1), float64(10), int64(3), object(2) memory usage: 51.9+ MB None ***************************************** <class 'pandas.core.frame.DataFrame'> Int64Index: 115064 entries, 0 to 115063 Data columns (total 15 columns): Store 115064 non-null int64 Dept 115064 non-null int64 Date 115064 non-null object IsHoliday 115064 non-null bool Type 115064 non-null object Size 115064 non-null int64 Temperature 115064 non-null float64 Fuel_Price 115064 non-null float64 MarkDown1 114915 non-null float64 MarkDown2 86437 non-null float64 MarkDown3 105235 non-null float64 MarkDown4 102176 non-null float64 MarkDown5 115064 non-null float64 CPI 76902 non-null float64 Unemployment 76902 non-null float64 dtypes: bool(1), float64(9), int64(3), object(2) memory usage: 13.3+ MB None
# Take only the rows whose weekly sales are strictly positive.
train = train.loc[train['Weekly_Sales'] > 0]

# Partition the column names by dtype. Columns whose dtype is in neither list
# (for example the boolean IsHoliday flag) are left out of both groups.
numeric_var_train = [
    col for col, col_dtype in train.dtypes.items()
    if col_dtype in ['float64', 'int64', 'float32', 'int32']
]
cat_var_train = [
    col for col, col_dtype in train.dtypes.items()
    if col_dtype in ['object']
]

# Numerical part of the training data
train_num = train[numeric_var_train]
# Categorical part of the training data
train_cat = train[cat_var_train]

print(numeric_var_train)
print(cat_var_train)
['Store', 'Dept', 'Weekly_Sales', 'Size', 'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment'] ['Date', 'Type']
# Use a general function that returns multiple values
def var_summary(x):
    """Return a descriptive-statistics summary for one numeric column.

    Parameters
    ----------
    x : pandas.Series
        Numeric values; missing entries are allowed.

    Returns
    -------
    pandas.Series
        Indexed by 'N' (non-null count), 'NMISS' (null count), 'SUM', 'MEAN',
        'MEDIAN', 'STD', 'VAR', 'MIN', the percentiles 'P1', 'P5', 'P10',
        'P25', 'P50', 'P75', 'P90', 'P95', 'P99', and 'MAX'.
    """
    # Drop missing values once instead of once per percentile.
    non_missing = x.dropna()
    percentiles = [
        non_missing.quantile(q)
        for q in (0.01, 0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99)
    ]
    values = [
        x.count(), x.isnull().sum(), x.sum(), x.mean(), x.median(),
        x.std(), x.var(), x.min(),
    ] + percentiles + [x.max()]
    return pd.Series(
        values,
        index=['N', 'NMISS', 'SUM', 'MEAN', 'MEDIAN', 'STD', 'VAR', 'MIN',
               'P1', 'P5', 'P10', 'P25', 'P50', 'P75', 'P90', 'P95', 'P99',
               'MAX'],
    )
# Summarise every numeric training column, then transpose so that each row
# describes one variable.
num_summary = train_num.apply(var_summary).T
num_summary
def cat_summary(x):
    """Return a small summary for one categorical column.

    Parameters
    ----------
    x : pandas.Series
        Categorical values; missing entries are allowed.

    Returns
    -------
    pandas.Series
        Indexed by 'N' (non-null count), 'NMISS' (null count) and
        'ColumnsNames', which holds the full ``value_counts()`` Series of
        the column as a single nested element.
    """
    return pd.Series(
        [x.count(), x.isnull().sum(), x.value_counts()],
        index=['N', 'NMISS', 'ColumnsNames'],
    )
# Summarise every categorical training column.
# NOTE(review): this assignment rebinds the name `cat_summary` from the
# function to the resulting DataFrame, so the function is no longer callable
# under that name after this statement.
cat_summary = train_cat.apply(cat_summary)
cat_summary
Date | Type | |
---|---|---|
N | 420212 | 420212 |
NMISS | 0 | 0 |
ColumnsNames | 2011-12-23 3018 2011-11-25 3016 2011-12-... | A 214961 B 162787 C 42464 Name: Type... |
# Partition the test column names by dtype. Columns whose dtype is in neither
# list (for example the boolean IsHoliday flag) are left out of both groups.
numeric_var_test = [
    col for col, col_dtype in test.dtypes.items()
    if col_dtype in ['float64', 'int64', 'float32', 'int32']
]
cat_var_test = [
    col for col, col_dtype in test.dtypes.items()
    if col_dtype in ['object']
]

# Numerical part of the test data
test_num = test[numeric_var_test]
# Categorical part of the test data
test_cat = test[cat_var_test]

print(numeric_var_test)
print(cat_var_test)
['Store', 'Dept', 'Size', 'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment'] ['Date', 'Type']
# Numerical summary report for the test data: one row per numeric variable.
# This overwrites the `num_summary` built earlier from the training data.
num_summary = test_num.apply(var_summary).T
num_summary.head()
N | NMISS | SUM | MEAN | MEDIAN | STD | VAR | MIN | P1 | P5 | P10 | P25 | P50 | P75 | P90 | P95 | P99 | MAX | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Store | 115064.0 | 0.0 | 2.558817e+06 | 22.238207 | 22.000 | 12.809930 | 1.640943e+02 | 1.000 | 1.000 | 3.000 | 5.000 | 11.000 | 22.000 | 33.000 | 40.000 | 43.000 | 45.000 | 45.000 |
Dept | 115064.0 | 0.0 | 5.101883e+06 | 44.339524 | 37.000 | 30.656410 | 9.398155e+02 | 1.000 | 1.000 | 4.000 | 7.000 | 18.000 | 37.000 | 74.000 | 92.000 | 95.000 | 98.000 | 99.000 |
Size | 115064.0 | 0.0 | 1.570597e+10 | 136497.688921 | 140167.000 | 61106.926438 | 3.734056e+09 | 34875.000 | 34875.000 | 39690.000 | 39910.000 | 93638.000 | 140167.000 | 202505.000 | 204184.000 | 206302.000 | 219622.000 | 219622.000 |
Temperature | 115064.0 | 0.0 | 6.206760e+06 | 53.941804 | 54.470 | 18.724153 | 3.505939e+02 | -7.290 | 11.440 | 23.980 | 29.970 | 39.820 | 54.470 | 67.350 | 79.480 | 83.820 | 92.140 | 101.950 |
Fuel_Price | 115064.0 | 0.0 | 4.121070e+05 | 3.581546 | 3.606 | 0.239442 | 5.733244e-02 | 2.872 | 2.957 | 3.161 | 3.227 | 3.431 | 3.606 | 3.766 | 3.866 | 3.951 | 4.079 | 4.125 |
# Categorical data summary report
def cat_summary(x):
    """Return a small summary for one categorical column.

    Parameters
    ----------
    x : pandas.Series
        Categorical values; missing entries are allowed.

    Returns
    -------
    pandas.Series
        Indexed by 'N' (non-null count), 'NMISS' (null count) and
        'ColumnsNames', which holds the full ``value_counts()`` Series of
        the column as a single nested element.
    """
    return pd.Series(
        [x.count(), x.isnull().sum(), x.value_counts()],
        index=['N', 'NMISS', 'ColumnsNames'],
    )
# Summarise every categorical test column.
# NOTE(review): this assignment rebinds the name `cat_summary` from the
# function to the resulting DataFrame, so the function is no longer callable
# under that name after this statement.
cat_summary = test_cat.apply(cat_summary)
cat_summary
Date | Type | |
---|---|---|
N | 115064 | 115064 |
NMISS | 0 | 0 |
ColumnsNames | 2012-12-21 3002 2012-12-07 2989 2012-12-... | A 58713 B 44500 C 11851 Name: Type, d... |
# Run pandas-profiling to generate an overall report for the training data.
import pandas_profiling
pandas_profiling.ProfileReport(train)
Dataset info
Number of variables | 16 |
---|---|
Number of observations | 421570 |
Total Missing (%) | 21.1% |
Total size in memory | 51.9 MiB |
Average record size in memory | 129.0 B |
Variables types
Numeric | 13 |
---|---|
Categorical | 2 |
Boolean | 1 |
Date | 0 |
Text (Unique) | 0 |
Rejected | 0 |
Unsupported | 0 |
Warnings
Date
has a high cardinality: 143 distinct values WarningMarkDown1
has 270889 / 64.3% missing values MissingMarkDown2
has 310322 / 73.6% missing values MissingMarkDown3
has 284479 / 67.5% missing values MissingMarkDown4
has 286603 / 68.0% missing values MissingMarkDown5
has 270138 / 64.1% missing values MissingCPI
Numeric
Distinct count | 2145 |
---|---|
Unique (%) | 0.5% |
Missing (%) | 0.0% |
Missing (n) | 0 |
Infinite (%) | 0.0% |
Infinite (n) | 0 |
Mean | 171.2 |
---|---|
Minimum | 126.06 |
Maximum | 227.23 |
Zeros (%) | 0.0% |
Quantile statistics
Minimum | 126.06 |
---|---|
5-th percentile | 126.5 |
Q1 | 132.02 |
Median | 182.32 |
Q3 | 212.42 |
95-th percentile | 221.94 |
Maximum | 227.23 |
Range | 101.17 |
Interquartile range | 80.394 |
Descriptive statistics
Standard deviation | 39.159 |
---|---|
Coef of variation | 0.22873 |
Kurtosis | -1.8297 |
Mean | 171.2 |
MAD | 38.066 |
Skewness | 0.085219 |
Sum | 72174000 |
Variance | 1533.4 |
Memory size | 6.4 MiB |
Value | Count | Frequency (%) | |
129.8555333 | 711 | 0.2% |
|
131.1083333 | 708 | 0.2% |
|
129.84596670000002 | 707 | 0.2% |
|
130.38490320000002 | 706 | 0.2% |
|
130.683 | 706 | 0.2% |
|
131.0756667 | 706 | 0.2% |
|
130.6457931 | 706 | 0.2% |
|
130.7196333 | 705 | 0.2% |
|
130.4546207 | 705 | 0.2% |
|
129.98454840000002 | 704 | 0.2% |
|
Other values (2135) | 414506 | 98.3% |
|
Minimum 5 values
Value | Count | Frequency (%) | |
126.064 | 678 | 0.2% |
|
126.0766452 | 679 | 0.2% |
|
126.08545159999998 | 675 | 0.2% |
|
126.08929029999999 | 682 | 0.2% |
|
126.1019355 | 686 | 0.2% |
|
Maximum 5 values
Value | Count | Frequency (%) | |
227.01841659999997 | 69 | 0.0% |
|
227.0369359 | 70 | 0.0% |
|
227.16939190000002 | 63 | 0.0% |
|
227.21428799999998 | 62 | 0.0% |
|
227.2328068 | 63 | 0.0% |
|
Date
Categorical
Distinct count | 143 |
---|---|
Unique (%) | 0.0% |
Missing (%) | 0.0% |
Missing (n) | 0 |
2011-12-23 |
|
---|---|
2011-11-25 |
|
2011-12-16 |
|
Other values (140) |
412509
|
Value | Count | Frequency (%) | |
2011-12-23 | 3027 | 0.7% |
|
2011-11-25 | 3021 | 0.7% |
|
2011-12-16 | 3013 | 0.7% |
|
2011-12-09 | 3010 | 0.7% |
|
2012-02-17 | 3007 | 0.7% |
|
2011-12-30 | 3003 | 0.7% |
|
2012-02-10 | 3001 | 0.7% |
|
2011-12-02 | 2994 | 0.7% |
|
2012-03-02 | 2990 | 0.7% |
|
2012-10-12 | 2990 | 0.7% |
|
Other values (133) | 391514 | 92.9% |
|
Dept
Numeric
Distinct count | 81 |
---|---|
Unique (%) | 0.0% |
Missing (%) | 0.0% |
Missing (n) | 0 |
Infinite (%) | 0.0% |
Infinite (n) | 0 |
Mean | 44.26 |
---|---|
Minimum | 1 |
Maximum | 99 |
Zeros (%) | 0.0% |
Quantile statistics
Minimum | 1 |
---|---|
5-th percentile | 4 |
Q1 | 18 |
Median | 37 |
Q3 | 74 |
95-th percentile | 95 |
Maximum | 99 |
Range | 98 |
Interquartile range | 56 |
Descriptive statistics
Standard deviation | 30.492 |
---|---|
Coef of variation | 0.68893 |
Kurtosis | -1.2156 |
Mean | 44.26 |
MAD | 26.537 |
Skewness | 0.35822 |
Sum | 18658822 |
Variance | 929.77 |
Memory size | 6.4 MiB |
Value | Count | Frequency (%) | |
1 | 6435 | 1.5% |
|
10 | 6435 | 1.5% |
|
38 | 6435 | 1.5% |
|
21 | 6435 | 1.5% |
|
67 | 6435 | 1.5% |
|
16 | 6435 | 1.5% |
|
14 | 6435 | 1.5% |
|
13 | 6435 | 1.5% |
|
79 | 6435 | 1.5% |
|
81 | 6435 | 1.5% |
|
Other values (71) | 357220 | 84.7% |
|
Minimum 5 values
Value | Count | Frequency (%) | |
1 | 6435 | 1.5% |
|
2 | 6435 | 1.5% |
|
3 | 6435 | 1.5% |
|
4 | 6435 | 1.5% |
|
5 | 6347 | 1.5% |
|
Maximum 5 values
Value | Count | Frequency (%) | |
95 | 6435 | 1.5% |
|
96 | 4854 | 1.2% |
|
97 | 6278 | 1.5% |
|
98 | 5836 | 1.4% |
|
99 | 862 | 0.2% |
|
Fuel_Price
Numeric
Distinct count | 892 |
---|---|
Unique (%) | 0.2% |
Missing (%) | 0.0% |
Missing (n) | 0 |
Infinite (%) | 0.0% |
Infinite (n) | 0 |
Mean | 3.361 |
---|---|
Minimum | 2.472 |
Maximum | 4.468 |
Zeros (%) | 0.0% |
Quantile statistics
Minimum | 2.472 |
---|---|
5-th percentile | 2.653 |
Q1 | 2.933 |
Median | 3.452 |
Q3 | 3.738 |
95-th percentile | 4.029 |
Maximum | 4.468 |
Range | 1.996 |
Interquartile range | 0.805 |
Descriptive statistics
Standard deviation | 0.45851 |
---|---|
Coef of variation | 0.13642 |
Kurtosis | -1.1854 |
Mean | 3.361 |
MAD | 0.4032 |
Skewness | -0.1049 |
Sum | 1416900 |
Variance | 0.21024 |
Memory size | 6.4 MiB |
Value | Count | Frequency (%) | |
3.638 | 2548 | 0.6% |
|
3.63 | 2164 | 0.5% |
|
2.7710000000000004 | 1917 | 0.5% |
|
3.891 | 1856 | 0.4% |
|
3.594 | 1796 | 0.4% |
|
3.5239999999999996 | 1793 | 0.4% |
|
3.523 | 1792 | 0.4% |
|
2.72 | 1790 | 0.4% |
|
3.6660000000000004 | 1778 | 0.4% |
|
2.78 | 1656 | 0.4% |
|
Other values (882) | 402480 | 95.5% |
|
Minimum 5 values
Value | Count | Frequency (%) | |
2.472 | 38 | 0.0% |
|
2.513 | 45 | 0.0% |
|
2.5140000000000002 | 906 | 0.2% |
|
2.52 | 39 | 0.0% |
|
2.533 | 42 | 0.0% |
|
Maximum 5 values
Value | Count | Frequency (%) | |
4.294 | 363 | 0.1% |
|
4.301 | 360 | 0.1% |
|
4.308 | 168 | 0.0% |
|
4.449 | 358 | 0.1% |
|
4.468 | 368 | 0.1% |
|
IsHoliday
Boolean
Distinct count | 2 |
---|---|
Unique (%) | 0.0% |
Missing (%) | 0.0% |
Missing (n) | 0 |
Mean | 0.070358 |
---|
True |
|
---|---|
(Missing) |
391909
|
Value | Count | Frequency (%) | |
True | 29661 | 7.0% |
|
(Missing) | 391909 | 93.0% |
|
MarkDown1
Numeric
Distinct count | 2278 |
---|---|
Unique (%) | 0.5% |
Missing (%) | 64.3% |
Missing (n) | 270889 |
Infinite (%) | 0.0% |
Infinite (n) | 0 |
Mean | 7246.4 |
---|---|
Minimum | 0.27 |
Maximum | 88647 |
Zeros (%) | 0.0% |
Quantile statistics
Minimum | 0.27 |
---|---|
5-th percentile | 149.19 |
Q1 | 2240.3 |
Median | 5347.4 |
Q3 | 9210.9 |
95-th percentile | 21801 |
Maximum | 88647 |
Range | 88646 |
Interquartile range | 6970.6 |
Descriptive statistics
Standard deviation | 8291.2 |
---|---|
Coef of variation | 1.1442 |
Kurtosis | 17.606 |
Mean | 7246.4 |
MAD | 5262.8 |
Skewness | 3.3418 |
Sum | 1091900000 |
Variance | 68744000 |
Memory size | 6.4 MiB |
Value | Count | Frequency (%) | |
1.5 | 102 | 0.0% |
|
460.73 | 102 | 0.0% |
|
175.64 | 93 | 0.0% |
|
1282.42 | 75 | 0.0% |
|
9264.48 | 75 | 0.0% |
|
686.24 | 75 | 0.0% |
|
5924.71 | 75 | 0.0% |
|
1483.17 | 75 | 0.0% |
|
3242.59 | 74 | 0.0% |
|
10671.71 | 74 | 0.0% |
|
Other values (2267) | 149861 | 35.5% |
|
(Missing) | 270889 | 64.3% |
|
Minimum 5 values
Value | Count | Frequency (%) | |
0.27 | 51 | 0.0% |
|
0.5 | 49 | 0.0% |
|
1.5 | 102 | 0.0% |
|
1.94 | 50 | 0.0% |
|
2.12 | 52 | 0.0% |
|
Maximum 5 values
Value | Count | Frequency (%) | |
62567.6 | 66 | 0.0% |
|
65021.23 | 73 | 0.0% |
|
75149.79 | 73 | 0.0% |
|
78124.5 | 70 | 0.0% |
|
88646.76 | 68 | 0.0% |
|
MarkDown2
Numeric
Distinct count | 1500 |
---|---|
Unique (%) | 0.4% |
Missing (%) | 73.6% |
Missing (n) | 310322 |
Infinite (%) | 0.0% |
Infinite (n) | 0 |
Mean | 3334.6 |
---|---|
Minimum | -265.76 |
Maximum | 104520 |
Zeros (%) | 0.0% |
Quantile statistics
Minimum | -265.76 |
---|---|
5-th percentile | 1.95 |
Q1 | 41.6 |
Median | 192 |
Q3 | 1926.9 |
95-th percentile | 16497 |
Maximum | 104520 |
Range | 104790 |
Interquartile range | 1885.3 |
Descriptive statistics
Standard deviation | 9475.4 |
---|---|
Coef of variation | 2.8415 |
Kurtosis | 37.59 |
Mean | 3334.6 |
MAD | 4690.4 |
Skewness | 5.4413 |
Sum | 370970000 |
Variance | 89782000 |
Memory size | 6.4 MiB |
Value | Count | Frequency (%) | |
1.91 | 539 | 0.1% |
|
3.0 | 493 | 0.1% |
|
0.5 | 485 | 0.1% |
|
1.5 | 471 | 0.1% |
|
4.0 | 367 | 0.1% |
|
6.0 | 365 | 0.1% |
|
7.64 | 354 | 0.1% |
|
3.82 | 353 | 0.1% |
|
5.73 | 345 | 0.1% |
|
19.0 | 345 | 0.1% |
|
Other values (1489) | 107131 | 25.4% |
|
(Missing) | 310322 | 73.6% |
|
Minimum 5 values
Value | Count | Frequency (%) | |
-265.76 | 71 | 0.0% |
|
-192.0 | 72 | 0.0% |
|
-20.0 | 72 | 0.0% |
|
-10.98 | 60 | 0.0% |
|
-10.5 | 143 | 0.0% |
|
Maximum 5 values
Value | Count | Frequency (%) | |
82881.16 | 73 | 0.0% |
|
89121.94 | 74 | 0.0% |
|
92523.94 | 73 | 0.0% |
|
97740.99 | 73 | 0.0% |
|
104519.54 | 72 | 0.0% |
|
MarkDown3
Numeric
Distinct count | 1663 |
---|---|
Unique (%) | 0.4% |
Missing (%) | 67.5% |
Missing (n) | 284479 |
Infinite (%) | 0.0% |
Infinite (n) | 0 |
Mean | 1439.4 |
---|---|
Minimum | -29.1 |
Maximum | 141630 |
Zeros (%) | 0.0% |
Quantile statistics
Minimum | -29.1 |
---|---|
5-th percentile | 0.65 |
Q1 | 5.08 |
Median | 24.6 |
Q3 | 103.99 |
95-th percentile | 1059.9 |
Maximum | 141630 |
Range | 141660 |
Interquartile range | 98.91 |
Descriptive statistics
Standard deviation | 9623.1 |
---|---|
Coef of variation | 6.6854 |
Kurtosis | 77.688 |
Mean | 1439.4 |
MAD | 2578.1 |
Skewness | 8.3995 |
Sum | 197330000 |
Variance | 92604000 |
Memory size | 6.4 MiB |
Value | Count | Frequency (%) | |
3.0 | 754 | 0.2% |
|
6.0 | 710 | 0.2% |
|
2.0 | 660 | 0.2% |
|
1.0 | 611 | 0.1% |
|
0.22 | 487 | 0.1% |
|
0.5 | 463 | 0.1% |
|
0.01 | 444 | 0.1% |
|
4.0 | 439 | 0.1% |
|
3.2 | 379 | 0.1% |
|
1.98 | 363 | 0.1% |
|
Other values (1652) | 131781 | 31.3% |
|
(Missing) | 284479 | 67.5% |
|
Minimum 5 values
Value | Count | Frequency (%) | |
-29.1 | 72 | 0.0% |
|
-1.0 | 70 | 0.0% |
|
-0.87 | 46 | 0.0% |
|
-0.2 | 69 | 0.0% |
|
0.0 | 67 | 0.0% |
|
Maximum 5 values
Value | Count | Frequency (%) | |
89402.64 | 71 | 0.0% |
|
101378.79 | 73 | 0.0% |
|
103991.94 | 72 | 0.0% |
|
109030.75 | 75 | 0.0% |
|
141630.61 | 74 | 0.0% |
|
MarkDown4
Numeric
Distinct count | 1945 |
---|---|
Unique (%) | 0.5% |
Missing (%) | 68.0% |
Missing (n) | 286603 |
Infinite (%) | 0.0% |
Infinite (n) | 0 |
Mean | 3383.2 |
---|---|
Minimum | 0.22 |
Maximum | 67475 |
Zeros (%) | 0.0% |
Quantile statistics
Minimum | 0.22 |
---|---|
5-th percentile | 28.76 |
Q1 | 504.22 |
Median | 1481.3 |
Q3 | 3595 |
95-th percentile | 12646 |
Maximum | 67475 |
Range | 67475 |
Interquartile range | 3090.8 |
Descriptive statistics
Standard deviation | 6292.4 |
---|---|
Coef of variation | 1.8599 |
Kurtosis | 29.997 |
Mean | 3383.2 |
MAD | 3329.7 |
Skewness | 4.8475 |
Sum | 456620000 |
Variance | 39594000 |
Memory size | 6.4 MiB |
Value | Count | Frequency (%) | |
9.0 | 280 | 0.1% |
|
4.0 | 200 | 0.0% |
|
2.0 | 197 | 0.0% |
|
3.0 | 146 | 0.0% |
|
47.0 | 143 | 0.0% |
|
67.72 | 142 | 0.0% |
|
17.0 | 141 | 0.0% |
|
657.56 | 141 | 0.0% |
|
8.0 | 140 | 0.0% |
|
1330.36 | 140 | 0.0% |
|
Other values (1934) | 133297 | 31.6% |
|
(Missing) | 286603 | 68.0% |
|
Minimum 5 values
Value | Count | Frequency (%) | |
0.22 | 57 | 0.0% |
|
0.41 | 52 | 0.0% |
|
0.46 | 48 | 0.0% |
|
0.78 | 52 | 0.0% |
|
0.87 | 49 | 0.0% |
|
Maximum 5 values
Value | Count | Frequency (%) | |
52739.02 | 72 | 0.0% |
|
53603.99 | 72 | 0.0% |
|
57815.43 | 68 | 0.0% |
|
57817.56 | 74 | 0.0% |
|
67474.85 | 72 | 0.0% |
|
MarkDown5
Numeric
Distinct count | 2294 |
---|---|
Unique (%) | 0.5% |
Missing (%) | 64.1% |
Missing (n) | 270138 |
Infinite (%) | 0.0% |
Infinite (n) | 0 |
Mean | 4629 |
---|---|
Minimum | 135.16 |
Maximum | 108520 |
Zeros (%) | 0.0% |
Quantile statistics
Minimum | 135.16 |
---|---|
5-th percentile | 715.52 |
Q1 | 1878.4 |
Median | 3359.4 |
Q3 | 5563.8 |
95-th percentile | 11269 |
Maximum | 108520 |
Range | 108380 |
Interquartile range | 3685.4 |
Descriptive statistics
Standard deviation | 5962.9 |
---|---|
Coef of variation | 1.2882 |
Kurtosis | 107.85 |
Mean | 4629 |
MAD | 2989.8 |
Skewness | 8.1699 |
Sum | 700970000 |
Variance | 35556000 |
Memory size | 6.4 MiB |
Value | Count | Frequency (%) | |
2743.18 | 136 | 0.0% |
|
1064.56 | 120 | 0.0% |
|
9083.54 | 75 | 0.0% |
|
20371.02 | 75 | 0.0% |
|
3567.03 | 75 | 0.0% |
|
4180.29 | 75 | 0.0% |
|
3557.67 | 75 | 0.0% |
|
986.23 | 74 | 0.0% |
|
1773.53 | 74 | 0.0% |
|
14660.97 | 74 | 0.0% |
|
Other values (2283) | 150579 | 35.7% |
|
(Missing) | 270138 | 64.1% |
|
Minimum 5 values
Value | Count | Frequency (%) | |
135.16 | 65 | 0.0% |
|
153.04 | 47 | 0.0% |
|
153.9 | 49 | 0.0% |
|
164.08 | 52 | 0.0% |
|
170.64 | 69 | 0.0% |
|
Maximum 5 values
Value | Count | Frequency (%) | |
58068.14 | 69 | 0.0% |
|
63005.58 | 69 | 0.0% |
|
85851.87 | 68 | 0.0% |
|
105223.11 | 70 | 0.0% |
|
108519.28 | 68 | 0.0% |
|
Size
Numeric
Distinct count | 40 |
---|---|
Unique (%) | 0.0% |
Missing (%) | 0.0% |
Missing (n) | 0 |
Infinite (%) | 0.0% |
Infinite (n) | 0 |
Mean | 136730 |
---|---|
Minimum | 34875 |
Maximum | 219622 |
Zeros (%) | 0.0% |
Quantile statistics
Minimum | 34875 |
---|---|
5-th percentile | 39690 |
Q1 | 93638 |
Median | 140170 |
Q3 | 202500 |
95-th percentile | 206300 |
Maximum | 219622 |
Range | 184747 |
Interquartile range | 108870 |
Descriptive statistics
Standard deviation | 60981 |
---|---|
Coef of variation | 0.446 |
Kurtosis | -1.2063 |
Mean | 136730 |
MAD | 52517 |
Skewness | -0.32585 |
Sum | 57640387438 |
Variance | 3718600000 |
Memory size | 6.4 MiB |
Value | Count | Frequency (%) | |
39690 | 20802 | 4.9% |
|
39910 | 20597 | 4.9% |
|
203819 | 20376 | 4.8% |
|
219622 | 10474 | 2.5% |
|
126512 | 10315 | 2.4% |
|
205863 | 10272 | 2.4% |
|
151315 | 10244 | 2.4% |
|
202307 | 10238 | 2.4% |
|
204184 | 10225 | 2.4% |
|
158114 | 10224 | 2.4% |
|
Other values (30) | 287803 | 68.3% |
|
Minimum 5 values
Value | Count | Frequency (%) | |
34875 | 8999 | 2.1% |
|
37392 | 9036 | 2.1% |
|
39690 | 20802 | 4.9% |
|
39910 | 20597 | 4.9% |
|
41062 | 6751 | 1.6% |
|
Maximum 5 values
Value | Count | Frequency (%) | |
204184 | 10225 | 2.4% |
|
205863 | 10272 | 2.4% |
|
206302 | 10113 | 2.4% |
|
207499 | 10062 | 2.4% |
|
219622 | 10474 | 2.5% |
|
Store
Numeric
Distinct count | 45 |
---|---|
Unique (%) | 0.0% |
Missing (%) | 0.0% |
Missing (n) | 0 |
Infinite (%) | 0.0% |
Infinite (n) | 0 |
Mean | 22.201 |
---|---|
Minimum | 1 |
Maximum | 45 |
Zeros (%) | 0.0% |
Quantile statistics
Minimum | 1 |
---|---|
5-th percentile | 3 |
Q1 | 11 |
Median | 22 |
Q3 | 33 |
95-th percentile | 43 |
Maximum | 45 |
Range | 44 |
Interquartile range | 22 |
Descriptive statistics
Standard deviation | 12.785 |
---|---|
Coef of variation | 0.5759 |
Kurtosis | -1.1465 |
Mean | 22.201 |
MAD | 10.996 |
Skewness | 0.077763 |
Sum | 9359084 |
Variance | 163.46 |
Memory size | 6.4 MiB |
Value | Count | Frequency (%) | |
13 | 10474 | 2.5% |
|
10 | 10315 | 2.4% |
|
4 | 10272 | 2.4% |
|
1 | 10244 | 2.4% |
|
2 | 10238 | 2.4% |
|
24 | 10228 | 2.4% |
|
27 | 10225 | 2.4% |
|
34 | 10224 | 2.4% |
|
20 | 10214 | 2.4% |
|
6 | 10211 | 2.4% |
|
Other values (35) | 318925 | 75.7% |
|
Minimum 5 values
Value | Count | Frequency (%) | |
1 | 10244 | 2.4% |
|
2 | 10238 | 2.4% |
|
3 | 9036 | 2.1% |
|
4 | 10272 | 2.4% |
|
5 | 8999 | 2.1% |
|
Maximum 5 values
Value | Count | Frequency (%) | |
41 | 10088 | 2.4% |
|
42 | 6953 | 1.6% |
|
43 | 6751 | 1.6% |
|
44 | 7169 | 1.7% |
|
45 | 9637 | 2.3% |
|
Temperature
Numeric
Distinct count | 3528 |
---|---|
Unique (%) | 0.8% |
Missing (%) | 0.0% |
Missing (n) | 0 |
Infinite (%) | 0.0% |
Infinite (n) | 0 |
Mean | 60.09 |
---|---|
Minimum | -2.06 |
Maximum | 100.14 |
Zeros (%) | 0.0% |
Quantile statistics
Minimum | -2.06 |
---|---|
5-th percentile | 27.31 |
Q1 | 46.68 |
Median | 62.09 |
Q3 | 74.28 |
95-th percentile | 87.27 |
Maximum | 100.14 |
Range | 102.2 |
Interquartile range | 27.6 |
Descriptive statistics
Standard deviation | 18.448 |
---|---|
Coef of variation | 0.307 |
Kurtosis | -0.63592 |
Mean | 60.09 |
MAD | 15.377 |
Skewness | -0.3214 |
Sum | 25332000 |
Variance | 340.33 |
Memory size | 6.4 MiB |
Value | Count | Frequency (%) | |
50.43 | 709 | 0.2% |
|
67.87 | 646 | 0.2% |
|
72.62 | 594 | 0.1% |
|
76.67 | 583 | 0.1% |
|
70.28 | 563 | 0.1% |
|
76.03 | 555 | 0.1% |
|
50.56 | 544 | 0.1% |
|
64.05 | 542 | 0.1% |
|
64.21 | 519 | 0.1% |
|
50.81 | 487 | 0.1% |
|
Other values (3518) | 415828 | 98.6% |
|
Minimum 5 values
Value | Count | Frequency (%) | |
-2.06 | 69 | 0.0% |
|
5.54 | 68 | 0.0% |
|
6.23 | 69 | 0.0% |
|
7.46 | 69 | 0.0% |
|
9.51 | 70 | 0.0% |
|
Maximum 5 values
Value | Count | Frequency (%) | |
99.2 | 46 | 0.0% |
|
99.22 | 185 | 0.0% |
|
99.66 | 48 | 0.0% |
|
100.07 | 46 | 0.0% |
|
100.14 | 44 | 0.0% |
|
Type
Categorical
Distinct count | 3 |
---|---|
Unique (%) | 0.0% |
Missing (%) | 0.0% |
Missing (n) | 0 |
A |
215478
|
---|---|
B |
163495
|
C |
|
Value | Count | Frequency (%) | |
A | 215478 | 51.1% |
|
B | 163495 | 38.8% |
|
C | 42597 | 10.1% |
|
Unemployment
Numeric
Distinct count | 349 |
---|---|
Unique (%) | 0.1% |
Missing (%) | 0.0% |
Missing (n) | 0 |
Infinite (%) | 0.0% |
Infinite (n) | 0 |
Mean | 7.9603 |
---|---|
Minimum | 3.879 |
Maximum | 14.313 |
Zeros (%) | 0.0% |
Quantile statistics
Minimum | 3.879 |
---|---|
5-th percentile | 5.326 |
Q1 | 6.891 |
Median | 7.866 |
Q3 | 8.572 |
95-th percentile | 12.187 |
Maximum | 14.313 |
Range | 10.434 |
Interquartile range | 1.681 |
Descriptive statistics
Standard deviation | 1.8633 |
---|---|
Coef of variation | 0.23407 |
Kurtosis | 2.7312 |
Mean | 7.9603 |
MAD | 1.283 |
Skewness | 1.1837 |
Sum | 3355800 |
Variance | 3.4719 |
Memory size | 6.4 MiB |
Value | Count | Frequency (%) | |
8.099 | 5152 | 1.2% |
|
8.163 | 3636 | 0.9% |
|
7.852 | 3614 | 0.9% |
|
7.343 | 3416 | 0.8% |
|
7.057 | 3414 | 0.8% |
|
7.931 | 3400 | 0.8% |
|
7.441 | 3397 | 0.8% |
|
6.565 | 3370 | 0.8% |
|
8.2 | 3361 | 0.8% |
|
6.891 | 3360 | 0.8% |
|
Other values (339) | 385450 | 91.4% |
|
Minimum 5 values
Value | Count | Frequency (%) | |
3.8789999999999996 | 287 | 0.1% |
|
4.077 | 938 | 0.2% |
|
4.125 | 1831 | 0.4% |
|
4.145 | 562 | 0.1% |
|
4.156000000000001 | 1815 | 0.4% |
|
Maximum 5 values
Value | Count | Frequency (%) | |
13.975 | 1529 | 0.4% |
|
14.020999999999999 | 2263 | 0.5% |
|
14.099 | 2441 | 0.6% |
|
14.18 | 2423 | 0.6% |
|
14.312999999999999 | 2636 | 0.6% |
|
Weekly_Sales
Numeric
Distinct count | 359464 |
---|---|
Unique (%) | 85.3% |
Missing (%) | 0.0% |
Missing (n) | 0 |
Infinite (%) | 0.0% |
Infinite (n) | 0 |
Mean | 15981 |
---|---|
Minimum | -4988.9 |
Maximum | 693100 |
Zeros (%) | 0.0% |
Quantile statistics
Minimum | -4988.9 |
---|---|
5-th percentile | 59.975 |
Q1 | 2079.7 |
Median | 7612 |
Q3 | 20206 |
95-th percentile | 61202 |
Maximum | 693100 |
Range | 698090 |
Interquartile range | 18126 |
Descriptive statistics
Standard deviation | 22711 |
---|---|
Coef of variation | 1.4211 |
Kurtosis | 21.491 |
Mean | 15981 |
MAD | 15161 |
Skewness | 3.262 |
Sum | 6737200000 |
Variance | 515800000 |
Memory size | 6.4 MiB |
Value | Count | Frequency (%) | |
10.0 | 353 | 0.1% |
|
5.0 | 289 | 0.1% |
|
20.0 | 232 | 0.1% |
|
15.0 | 215 | 0.1% |
|
12.0 | 175 | 0.0% |
|
1.0 | 169 | 0.0% |
|
10.47 | 167 | 0.0% |
|
11.97 | 154 | 0.0% |
|
2.0 | 148 | 0.0% |
|
7.0 | 146 | 0.0% |
|
Other values (359454) | 419522 | 99.5% |
|
Minimum 5 values
Value | Count | Frequency (%) | |
-4988.94 | 1 | 0.0% |
|
-3924.0 | 1 | 0.0% |
|
-1750.0 | 1 | 0.0% |
|
-1699.0 | 1 | 0.0% |
|
-1321.48 | 1 | 0.0% |
|
Maximum 5 values
Value | Count | Frequency (%) | |
474330.1 | 1 | 0.0% |
|
627962.93 | 1 | 0.0% |
|
630999.19 | 1 | 0.0% |
|
649770.18 | 1 | 0.0% |
|
693099.36 | 1 | 0.0% |
|
Store | Dept | Date | Weekly_Sales | IsHoliday | Type | Size | Temperature | Fuel_Price | MarkDown1 | MarkDown2 | MarkDown3 | MarkDown4 | MarkDown5 | CPI | Unemployment | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 2010-02-05 | 24924.50 | False | A | 151315 | 42.31 | 2.572 | NaN | NaN | NaN | NaN | NaN | 211.096358 | 8.106 |
1 | 1 | 2 | 2010-02-05 | 50605.27 | False | A | 151315 | 42.31 | 2.572 | NaN | NaN | NaN | NaN | NaN | 211.096358 | 8.106 |
2 | 1 | 3 | 2010-02-05 | 13740.12 | False | A | 151315 | 42.31 | 2.572 | NaN | NaN | NaN | NaN | NaN | 211.096358 | 8.106 |
3 | 1 | 4 | 2010-02-05 | 39954.04 | False | A | 151315 | 42.31 | 2.572 | NaN | NaN | NaN | NaN | NaN | 211.096358 | 8.106 |
4 | 1 | 5 | 2010-02-05 | 32229.38 | False | A | 151315 | 42.31 | 2.572 | NaN | NaN | NaN | NaN | NaN | 211.096358 | 8.106 |
# Generate the same profiling report for the test data.
pandas_profiling.ProfileReport(test)
Dataset info
Number of variables | 15 |
---|---|
Number of observations | 115064 |
Total Missing (%) | 7.4% |
Total size in memory | 13.3 MiB |
Average record size in memory | 121.0 B |
Variables types
Numeric | 12 |
---|---|
Categorical | 2 |
Boolean | 1 |
Date | 0 |
Text (Unique) | 0 |
Rejected | 0 |
Unsupported | 0 |
Warnings
CPI
Numeric
Distinct count | 361 |
---|---|
Unique (%) | 0.3% |
Missing (%) | 33.2% |
Missing (n) | 38162 |
Infinite (%) | 0.0% |
Infinite (n) | 0 |
Mean | 176.96 |
---|---|
Minimum | 131.24 |
Maximum | 228.98 |
Zeros (%) | 0.0% |
Quantile statistics
Minimum | 131.24 |
---|---|
5-th percentile | 131.48 |
Q1 | 138.4 |
Median | 192.3 |
Q3 | 223.24 |
95-th percentile | 227.78 |
Maximum | 228.98 |
Range | 97.74 |
Interquartile range | 84.842 |
Descriptive statistics
Standard deviation | 41.24 |
---|---|
Coef of variation | 0.23305 |
Kurtosis | -1.8588 |
Mean | 176.96 |
MAD | 40.222 |
Skewness | 0.071448 |
Sum | 13609000 |
Variance | 1700.7 |
Memory size | 1.8 MiB |
Value | Count | Frequency (%) | |
132.71609679999997 | 2080 | 1.8% |
|
139.1226129 | 1664 | 1.4% |
|
201.0705712 | 825 | 0.7% |
|
224.80253140000002 | 783 | 0.7% |
|
131.537 | 704 | 0.6% |
|
132.2725714 | 703 | 0.6% |
|
131.2793548 | 702 | 0.6% |
|
131.642 | 702 | 0.6% |
|
131.4784 | 701 | 0.6% |
|
132.65377420000002 | 698 | 0.6% |
|
Other values (350) | 67340 | 58.5% |
|
(Missing) | 38162 | 33.2% |
|
Minimum 5 values
Value | Count | Frequency (%) | |
131.2362258 | 695 | 0.6% |
|
131.2793548 | 702 | 0.6% |
|
131.3258 | 696 | 0.6% |
|
131.37666670000002 | 695 | 0.6% |
|
131.4275333 | 693 | 0.6% |
|
Maximum 5 values
Value | Count | Frequency (%) | |
228.72986380000003 | 401 | 0.3% |
|
228.7796682 | 208 | 0.2% |
|
228.8020401 | 60 | 0.1% |
|
228.8892482 | 60 | 0.1% |
|
228.9764563 | 186 | 0.2% |
|
Date
Categorical
Distinct count | 39 |
---|---|
Unique (%) | 0.0% |
Missing (%) | 0.0% |
Missing (n) | 0 |
2012-12-21 |
|
---|---|
2012-12-07 |
|
2012-12-28 |
|
Other values (36) |
106085
|
Value | Count | Frequency (%) | |
2012-12-21 | 3002 | 2.6% |
|
2012-12-07 | 2989 | 2.6% |
|
2012-12-28 | 2988 | 2.6% |
|
2012-12-14 | 2986 | 2.6% |
|
2013-02-15 | 2984 | 2.6% |
|
2012-11-23 | 2976 | 2.6% |
|
2012-11-09 | 2971 | 2.6% |
|
2013-01-04 | 2964 | 2.6% |
|
2013-02-08 | 2964 | 2.6% |
|
2012-11-30 | 2962 | 2.6% |
|
Other values (29) | 85278 | 74.1% |
|
Dept
Numeric
Distinct count | 81 |
---|---|
Unique (%) | 0.1% |
Missing (%) | 0.0% |
Missing (n) | 0 |
Infinite (%) | 0.0% |
Infinite (n) | 0 |
Mean | 44.34 |
---|---|
Minimum | 1 |
Maximum | 99 |
Zeros (%) | 0.0% |
Quantile statistics
Minimum | 1 |
---|---|
5-th percentile | 4 |
Q1 | 18 |
Median | 37 |
Q3 | 74 |
95-th percentile | 95 |
Maximum | 99 |
Range | 98 |
Interquartile range | 56 |
Descriptive statistics
Standard deviation | 30.656 |
---|---|
Coef of variation | 0.6914 |
Kurtosis | -1.2242 |
Mean | 44.34 |
MAD | 26.74 |
Skewness | 0.36242 |
Sum | 5101883 |
Variance | 939.82 |
Memory size | 1.8 MiB |
Value | Count | Frequency (%) | |
1 | 1755 | 1.5% |
|
13 | 1755 | 1.5% |
|
91 | 1755 | 1.5% |
|
90 | 1755 | 1.5% |
|
21 | 1755 | 1.5% |
|
38 | 1755 | 1.5% |
|
82 | 1755 | 1.5% |
|
40 | 1755 | 1.5% |
|
81 | 1755 | 1.5% |
|
16 | 1755 | 1.5% |
|
Other values (71) | 97514 | 84.7% |
|
Minimum 5 values
Value | Count | Frequency (%) | |
1 | 1755 | 1.5% |
|
2 | 1755 | 1.5% |
|
3 | 1755 | 1.5% |
|
4 | 1755 | 1.5% |
|
5 | 1738 | 1.5% |
|
Maximum 5 values
Value | Count | Frequency (%) | |
95 | 1755 | 1.5% |
|
96 | 1350 | 1.2% |
|
97 | 1716 | 1.5% |
|
98 | 1632 | 1.4% |
|
99 | 613 | 0.5% |
|
Fuel_Price
Numeric
Distinct count | 297 |
---|---|
Unique (%) | 0.3% |
Missing (%) | 0.0% |
Missing (n) | 0 |
Infinite (%) | 0.0% |
Infinite (n) | 0 |
Mean | 3.5815 |
---|---|
Minimum | 2.872 |
Maximum | 4.125 |
Zeros (%) | 0.0% |
Quantile statistics
Minimum | 2.872 |
---|---|
5-th percentile | 3.161 |
Q1 | 3.431 |
Median | 3.606 |
Q3 | 3.766 |
95-th percentile | 3.951 |
Maximum | 4.125 |
Range | 1.253 |
Interquartile range | 0.335 |
Descriptive statistics
Standard deviation | 0.23944 |
---|---|
Coef of variation | 0.066854 |
Kurtosis | -0.1176 |
Mean | 3.5815 |
MAD | 0.18861 |
Skewness | -0.39128 |
Sum | 412110 |
Variance | 0.057332 |
Memory size | 1.8 MiB |
Value | Count | Frequency (%) | |
3.417 | 1853 | 1.6% |
|
3.583 | 1851 | 1.6% |
|
3.386 | 1793 | 1.6% |
|
3.611 | 1374 | 1.2% |
|
3.108 | 1201 | 1.0% |
|
3.4789999999999996 | 1169 | 1.0% |
|
3.597 | 1071 | 0.9% |
|
3.451 | 1043 | 0.9% |
|
3.227 | 1040 | 0.9% |
|
3.614 | 1028 | 0.9% |
|
Other values (287) | 101641 | 88.3% |
|
Minimum 5 values
Value | Count | Frequency (%) | |
2.872 | 276 | 0.2% |
|
2.889 | 276 | 0.2% |
|
2.9139999999999997 | 193 | 0.2% |
|
2.927 | 194 | 0.2% |
|
2.957 | 279 | 0.2% |
|
Maximum 5 values
Value | Count | Frequency (%) | |
4.079 | 282 | 0.2% |
|
4.099 | 355 | 0.3% |
|
4.104 | 186 | 0.2% |
|
4.109 | 189 | 0.2% |
|
4.125 | 166 | 0.1% |
|
IsHoliday
Boolean
Distinct count | 2 |
---|---|
Unique (%) | 0.0% |
Missing (%) | 0.0% |
Missing (n) | 0 |
Mean | 0.077592 |
---|
True |
|
---|---|
(Missing) |
106136
|
Value | Count | Frequency (%) | |
True | 8928 | 7.8% |
|
(Missing) | 106136 | 92.2% |
|
MarkDown1
Numeric
Distinct count | 1753 |
---|---|
Unique (%) | 1.5% |
Missing (%) | 0.1% |
Missing (n) | 149 |
Infinite (%) | 0.0% |
Infinite (n) | 0 |
Mean | 7689.2 |
---|---|
Minimum | -2781.4 |
Maximum | 103180 |
Zeros (%) | 0.0% |
Quantile statistics
Minimum | -2781.4 |
---|---|
5-th percentile | 189.49 |
Q1 | 1966.5 |
Median | 4842.3 |
Q3 | 9439.1 |
95-th percentile | 23141 |
Maximum | 103180 |
Range | 105970 |
Interquartile range | 7472.7 |
Descriptive statistics
Standard deviation | 10699 |
---|---|
Coef of variation | 1.3914 |
Kurtosis | 22.871 |
Mean | 7689.2 |
MAD | 6160.2 |
Skewness | 4.1727 |
Sum | 883610000 |
Variance | 114460000 |
Memory size | 1.8 MiB |
Value | Count | Frequency (%) | |
4655.55 | 74 | 0.1% |
|
13357.31 | 74 | 0.1% |
|
22673.11 | 74 | 0.1% |
|
13613.52 | 74 | 0.1% |
|
5692.66 | 74 | 0.1% |
|
10755.57 | 74 | 0.1% |
|
9753.88 | 74 | 0.1% |
|
20297.6 | 74 | 0.1% |
|
5813.45 | 73 | 0.1% |
|
7701.72 | 73 | 0.1% |
|
Other values (1742) | 114177 | 99.2% |
|
(Missing) | 149 | 0.1% |
|
Minimum 5 values
Value | Count | Frequency (%) | |
-2781.45 | 50 | 0.0% |
|
-772.21 | 43 | 0.0% |
|
-563.9 | 70 | 0.1% |
|
-16.93 | 44 | 0.0% |
|
2.14 | 46 | 0.0% |
|
Maximum 5 values
Value | Count | Frequency (%) | |
80498.65 | 71 | 0.1% |
|
84139.36 | 72 | 0.1% |
|
88750.34 | 66 | 0.1% |
|
95102.5 | 71 | 0.1% |
|
103184.98 | 72 | 0.1% |
|
MarkDown2
Numeric
Distinct count | 1258 |
---|---|
Unique (%) | 1.1% |
Missing (%) | 24.9% |
Missing (n) | 28627 |
Infinite (%) | 0.0% |
Infinite (n) | 0 |
Mean | 3734.1 |
---|---|
Minimum | -35.74 |
Maximum | 71074 |
Zeros (%) | 0.0% |
Quantile statistics
Minimum | -35.74 |
---|---|
5-th percentile | 6.14 |
Q1 | 180.35 |
Median | 742.59 |
Q3 | 2735.7 |
95-th percentile | 22672 |
Maximum | 71074 |
Range | 71110 |
Interquartile range | 2555.3 |
Descriptive statistics
Standard deviation | 8323.5 |
---|---|
Coef of variation | 2.2291 |
Kurtosis | 15.881 |
Mean | 3734.1 |
MAD | 4697.7 |
Skewness | 3.7406 |
Sum | 322760000 |
Variance | 69281000 |
Memory size | 1.8 MiB |
Value | Count | Frequency (%) | |
0.01 | 346 | 0.3% |
|
0.03 | 340 | 0.3% |
|
82.92 | 217 | 0.2% |
|
11.0 | 214 | 0.2% |
|
3.0 | 209 | 0.2% |
|
4.0 | 191 | 0.2% |
|
104.92 | 141 | 0.1% |
|
1.49 | 138 | 0.1% |
|
0.06 | 138 | 0.1% |
|
7.5 | 137 | 0.1% |
|
Other values (1247) | 84366 | 73.3% |
|
(Missing) | 28627 | 24.9% |
|
Minimum 5 values
Value | Count | Frequency (%) | |
-35.74 | 63 | 0.1% |
|
-15.45 | 71 | 0.1% |
|
-7.76 | 65 | 0.1% |
|
-3.27 | 69 | 0.1% |
|
-0.05 | 73 | 0.1% |
|
Maximum 5 values
Value | Count | Frequency (%) | |
52304.87 | 73 | 0.1% |
|
52850.71 | 74 | 0.1% |
|
56549.69 | 73 | 0.1% |
|
59362.3 | 72 | 0.1% |
|
71074.17 | 72 | 0.1% |
|
MarkDown3
Numeric
Distinct count | 1422 |
---|---|
Unique (%) | 1.2% |
Missing (%) | 8.5% |
Missing (n) | 9829 |
Infinite (%) | 0.0% |
Infinite (n) | 0 |
Mean | 2403.1 |
---|---|
Minimum | -179.26 |
Maximum | 149480 |
Zeros (%) | 0.0% |
Quantile statistics
Minimum | -179.26 |
---|---|
5-th percentile | 1.18 |
Q1 | 15.1 |
Median | 78.26 |
Q3 | 272.58 |
95-th percentile | 2361.6 |
Maximum | 149480 |
Range | 149660 |
Interquartile range | 257.48 |
Descriptive statistics
Standard deviation | 13768 |
---|---|
Coef of variation | 5.7293 |
Kurtosis | 54.091 |
Mean | 2403.1 |
MAD | 4226.8 |
Skewness | 7.1461 |
Sum | 252890000 |
Variance | 189560000 |
Memory size | 1.8 MiB |
Value | Count | Frequency (%) | |
1.2 | 599 | 0.5% |
|
1.0 | 498 | 0.4% |
|
0.6 | 419 | 0.4% |
|
0.8 | 348 | 0.3% |
|
2.0 | 324 | 0.3% |
|
0.4 | 278 | 0.2% |
|
0.2 | 272 | 0.2% |
|
5.0 | 271 | 0.2% |
|
3.0 | 271 | 0.2% |
|
0.1 | 269 | 0.2% |
|
Other values (1411) | 101686 | 88.4% |
|
(Missing) | 9829 | 8.5% |
|
Minimum 5 values
Value | Count | Frequency (%) | |
-179.26 | 62 | 0.1% |
|
-89.1 | 66 | 0.1% |
|
-44.54 | 67 | 0.1% |
|
-23.97 | 72 | 0.1% |
|
-17.44 | 69 | 0.1% |
|
Maximum 5 values
Value | Count | Frequency (%) | |
115048.81 | 73 | 0.1% |
|
130129.11 | 70 | 0.1% |
|
139621.51 | 72 | 0.1% |
|
146394.44 | 72 | 0.1% |
|
149483.31 | 73 | 0.1% |
|
MarkDown4
Numeric
Distinct count | 1484 |
---|---|
Unique (%) | 1.3% |
Missing (%) | 11.2% |
Missing (n) | 12888 |
Infinite (%) | 0.0% |
Infinite (n) | 0 |
Mean | 3356.2 |
---|---|
Minimum | 0.22 |
Maximum | 65345 |
Zeros (%) | 0.0% |
Quantile statistics
Minimum | 0.22 |
---|---|
5-th percentile | 16.96 |
Q1 | 155.46 |
Median | 840.94 |
Q3 | 3096.9 |
95-th percentile | 14191 |
Maximum | 65345 |
Range | 65344 |
Interquartile range | 2941.5 |
Descriptive statistics
Standard deviation | 7570.5 |
---|---|
Coef of variation | 2.2557 |
Kurtosis | 25.452 |
Mean | 3356.2 |
MAD | 3897.5 |
Skewness | 4.6686 |
Sum | 342930000 |
Variance | 57312000 |
Memory size | 1.8 MiB |
Value | Count | Frequency (%) | |
3.0 | 171 | 0.1% |
|
0.63 | 154 | 0.1% |
|
358.15 | 145 | 0.1% |
|
55.46 | 142 | 0.1% |
|
2.61 | 141 | 0.1% |
|
3.97 | 138 | 0.1% |
|
4.88 | 137 | 0.1% |
|
27.44 | 136 | 0.1% |
|
970.77 | 134 | 0.1% |
|
1.92 | 120 | 0.1% |
|
Other values (1473) | 100758 | 87.6% |
|
(Missing) | 12888 | 11.2% |
|
Minimum 5 values
Value | Count | Frequency (%) | |
0.22 | 56 | 0.0% |
|
0.63 | 154 | 0.1% |
|
0.66 | 46 | 0.0% |
|
0.78 | 54 | 0.0% |
|
1.26 | 43 | 0.0% |
|
Maximum 5 values
Value | Count | Frequency (%) | |
56735.25 | 72 | 0.1% |
|
60065.82 | 72 | 0.1% |
|
63130.81 | 70 | 0.1% |
|
63830.91 | 71 | 0.1% |
|
65344.64 | 72 | 0.1% |
|
MarkDown5
Numeric
Distinct count | 1754 |
---|---|
Unique (%) | 1.5% |
Missing (%) | 0.0% |
Missing (n) | 0 |
Infinite (%) | 0.0% |
Infinite (n) | 0 |
Mean | 3922.7 |
---|---|
Minimum | -185.17 |
Maximum | 771450 |
Zeros (%) | 0.0% |
Quantile statistics
Minimum | -185.17 |
---|---|
5-th percentile | 540.89 |
Q1 | 1309.3 |
Median | 2390.4 |
Q3 | 4227.3 |
95-th percentile | 9316.7 |
Maximum | 771450 |
Range | 771630 |
Interquartile range | 2918 |
Descriptive statistics
Standard deviation | 19445 |
---|---|
Coef of variation | 4.9571 |
Kurtosis | 1494.9 |
Mean | 3922.7 |
MAD | 2983.7 |
Skewness | 37.977 |
Sum | 451360000 |
Variance | 378110000 |
Memory size | 1.8 MiB |
Value | Count | Frequency (%) | |
3113.78 | 137 | 0.1% |
|
7968.28 | 74 | 0.1% |
|
2105.14 | 74 | 0.1% |
|
18831.34 | 74 | 0.1% |
|
22677.91 | 74 | 0.1% |
|
2167.73 | 74 | 0.1% |
|
1947.25 | 74 | 0.1% |
|
21807.99 | 74 | 0.1% |
|
5449.98 | 74 | 0.1% |
|
860.36 | 73 | 0.1% |
|
Other values (1744) | 114262 | 99.3% |
|
Minimum 5 values
Value | Count | Frequency (%) | |
-185.17 | 63 | 0.1% |
|
-37.02 | 73 | 0.1% |
|
40.98 | 44 | 0.0% |
|
60.92 | 65 | 0.1% |
|
114.25 | 51 | 0.0% |
|
Maximum 5 values
Value | Count | Frequency (%) | |
35238.98 | 72 | 0.1% |
|
43336.34 | 70 | 0.1% |
|
45050.55 | 70 | 0.1% |
|
45648.88 | 69 | 0.1% |
|
771448.1 | 71 | 0.1% |
|
Size
Numeric
Distinct count | 40 |
---|---|
Unique (%) | 0.0% |
Missing (%) | 0.0% |
Missing (n) | 0 |
Infinite (%) | 0.0% |
Infinite (n) | 0 |
Mean | 136500 |
---|---|
Minimum | 34875 |
Maximum | 219622 |
Zeros (%) | 0.0% |
Quantile statistics
Minimum | 34875 |
---|---|
5-th percentile | 39690 |
Q1 | 93638 |
Median | 140170 |
Q3 | 202500 |
95-th percentile | 206300 |
Maximum | 219622 |
Range | 184747 |
Interquartile range | 108870 |
Descriptive statistics
Standard deviation | 61107 |
---|---|
Coef of variation | 0.44768 |
Kurtosis | -1.2144 |
Mean | 136500 |
MAD | 52641 |
Skewness | -0.32195 |
Sum | 15705970078 |
Variance | 3734100000 |
Memory size | 1.8 MiB |
Value | Count | Frequency (%) | |
39910 | 5803 | 5.0% |
|
39690 | 5702 | 5.0% |
|
203819 | 5589 | 4.9% |
|
219622 | 2836 | 2.5% |
|
205863 | 2803 | 2.4% |
|
202307 | 2797 | 2.4% |
|
204184 | 2791 | 2.4% |
|
202505 | 2788 | 2.4% |
|
151315 | 2783 | 2.4% |
|
126512 | 2782 | 2.4% |
|
Other values (30) | 78390 | 68.1% |
|
Minimum 5 values
Value | Count | Frequency (%) | |
34875 | 2447 | 2.1% |
|
37392 | 2473 | 2.1% |
|
39690 | 5702 | 5.0% |
|
39910 | 5803 | 5.0% |
|
41062 | 1863 | 1.6% |
|
Maximum 5 values
Value | Count | Frequency (%) | |
204184 | 2791 | 2.4% |
|
205863 | 2803 | 2.4% |
|
206302 | 2745 | 2.4% |
|
207499 | 2756 | 2.4% |
|
219622 | 2836 | 2.5% |
|
Store
Numeric
Distinct count | 45 |
---|---|
Unique (%) | 0.0% |
Missing (%) | 0.0% |
Missing (n) | 0 |
Infinite (%) | 0.0% |
Infinite (n) | 0 |
Mean | 22.238 |
---|---|
Minimum | 1 |
Maximum | 45 |
Zeros (%) | 0.0% |
Quantile statistics
Minimum | 1 |
---|---|
5-th percentile | 3 |
Q1 | 11 |
Median | 22 |
Q3 | 33 |
95-th percentile | 43 |
Maximum | 45 |
Range | 44 |
Interquartile range | 22 |
Descriptive statistics
Standard deviation | 12.81 |
---|---|
Coef of variation | 0.57603 |
Kurtosis | -1.1498 |
Mean | 22.238 |
MAD | 11.02 |
Skewness | 0.076773 |
Sum | 2558817 |
Variance | 164.09 |
Memory size | 1.8 MiB |
Value | Count | Frequency (%) | |
13 | 2836 | 2.5% |
|
4 | 2803 | 2.4% |
|
19 | 2799 | 2.4% |
|
2 | 2797 | 2.4% |
|
27 | 2791 | 2.4% |
|
24 | 2790 | 2.4% |
|
6 | 2788 | 2.4% |
|
1 | 2783 | 2.4% |
|
10 | 2782 | 2.4% |
|
20 | 2774 | 2.4% |
|
Other values (35) | 87121 | 75.7% |
|
Minimum 5 values
Value | Count | Frequency (%) | |
1 | 2783 | 2.4% |
|
2 | 2797 | 2.4% |
|
3 | 2473 | 2.1% |
|
4 | 2803 | 2.4% |
|
5 | 2447 | 2.1% |
|
Maximum 5 values
Value | Count | Frequency (%) | |
41 | 2754 | 2.4% |
|
42 | 1962 | 1.7% |
|
43 | 1863 | 1.6% |
|
44 | 2072 | 1.8% |
|
45 | 2626 | 2.3% |
|
Temperature
Numeric
Distinct count | 1236 |
---|---|
Unique (%) | 1.1% |
Missing (%) | 0.0% |
Missing (n) | 0 |
Infinite (%) | 0.0% |
Infinite (n) | 0 |
Mean | 53.942 |
---|---|
Minimum | -7.29 |
Maximum | 101.95 |
Zeros (%) | 0.0% |
Quantile statistics
Minimum | -7.29 |
---|---|
5-th percentile | 23.98 |
Q1 | 39.82 |
Median | 54.47 |
Q3 | 67.35 |
95-th percentile | 83.82 |
Maximum | 101.95 |
Range | 109.24 |
Interquartile range | 27.53 |
Descriptive statistics
Standard deviation | 18.724 |
---|---|
Coef of variation | 0.34712 |
Kurtosis | -0.49597 |
Mean | 53.942 |
MAD | 15.417 |
Skewness | -0.07357 |
Sum | 6206800 |
Variance | 350.59 |
Memory size | 1.8 MiB |
Value | Count | Frequency (%) | |
57.25 | 312 | 0.3% |
|
70.74 | 309 | 0.3% |
|
70.18 | 309 | 0.3% |
|
38.95 | 272 | 0.2% |
|
70.01 | 263 | 0.2% |
|
57.87 | 262 | 0.2% |
|
85.0 | 261 | 0.2% |
|
52.38 | 260 | 0.2% |
|
79.15 | 260 | 0.2% |
|
58.66 | 259 | 0.2% |
|
Other values (1226) | 112297 | 97.6% |
|
Minimum 5 values
Value | Count | Frequency (%) | |
-7.29 | 69 | 0.1% |
|
-6.61 | 69 | 0.1% |
|
-6.08 | 70 | 0.1% |
|
0.25 | 68 | 0.1% |
|
2.32 | 71 | 0.1% |
|
Maximum 5 values
Value | Count | Frequency (%) | |
94.1 | 45 | 0.0% |
|
95.1 | 45 | 0.0% |
|
95.51 | 45 | 0.0% |
|
99.66 | 48 | 0.0% |
|
101.95 | 187 | 0.2% |
|
Type
Categorical
Distinct count | 3 |
---|---|
Unique (%) | 0.0% |
Missing (%) | 0.0% |
Missing (n) | 0 |
A |
58713
|
---|---|
B |
44500
|
C |
|
Value | Count | Frequency (%) | |
A | 58713 | 51.0% |
|
B | 44500 | 38.7% |
|
C | 11851 | 10.3% |
|
Unemployment
Numeric
Distinct count | 90 |
---|---|
Unique (%) | 0.1% |
Missing (%) | 33.2% |
Missing (n) | 38162 |
Infinite (%) | 0.0% |
Infinite (n) | 0 |
Mean | 6.8687 |
---|---|
Minimum | 3.684 |
Maximum | 10.199 |
Zeros (%) | 0.0% |
Quantile statistics
Minimum | 3.684 |
---|---|
5-th percentile | 3.932 |
Q1 | 5.771 |
Median | 6.806 |
Q3 | 8.036 |
95-th percentile | 9.91 |
Maximum | 10.199 |
Range | 6.515 |
Interquartile range | 2.265 |
Descriptive statistics
Standard deviation | 1.5834 |
---|---|
Coef of variation | 0.23053 |
Kurtosis | -0.60933 |
Mean | 6.8687 |
MAD | 1.3101 |
Skewness | 0.1414 |
Sum | 528220 |
Variance | 2.5072 |
Memory size | 1.8 MiB |
Value | Count | Frequency (%) | |
6.237 | 3377 | 2.9% |
|
9.91 | 2454 | 2.1% |
|
6.17 | 2336 | 2.0% |
|
6.266 | 2147 | 1.9% |
|
5.372000000000001 | 1871 | 1.6% |
|
8.036 | 1823 | 1.6% |
|
7.107 | 1808 | 1.6% |
|
3.932 | 1808 | 1.6% |
|
7.439 | 1805 | 1.6% |
|
8.625 | 1783 | 1.5% |
|
Other values (79) | 55690 | 48.4% |
|
(Missing) | 38162 | 33.2% |
|
Minimum 5 values
Value | Count | Frequency (%) | |
3.6839999999999997 | 556 | 0.5% |
|
3.8789999999999996 | 650 | 0.6% |
|
3.8960000000000004 | 288 | 0.3% |
|
3.9210000000000003 | 932 | 0.8% |
|
3.932 | 1808 | 1.6% |
|
Maximum 5 values
Value | Count | Frequency (%) | |
8.951 | 847 | 0.7% |
|
9.151 | 588 | 0.5% |
|
9.874 | 751 | 0.7% |
|
9.91 | 2454 | 2.1% |
|
10.199 | 1731 | 1.5% |
|
Store | Dept | Date | IsHoliday | Type | Size | Temperature | Fuel_Price | MarkDown1 | MarkDown2 | MarkDown3 | MarkDown4 | MarkDown5 | CPI | Unemployment | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 2012-11-02 | False | A | 151315 | 55.32 | 3.386 | 6766.44 | 5147.7 | 50.82 | 3639.9 | 2737.42 | 223.462779 | 6.573 |
1 | 1 | 2 | 2012-11-02 | False | A | 151315 | 55.32 | 3.386 | 6766.44 | 5147.7 | 50.82 | 3639.9 | 2737.42 | 223.462779 | 6.573 |
2 | 1 | 3 | 2012-11-02 | False | A | 151315 | 55.32 | 3.386 | 6766.44 | 5147.7 | 50.82 | 3639.9 | 2737.42 | 223.462779 | 6.573 |
3 | 1 | 4 | 2012-11-02 | False | A | 151315 | 55.32 | 3.386 | 6766.44 | 5147.7 | 50.82 | 3639.9 | 2737.42 | 223.462779 | 6.573 |
4 | 1 | 5 | 2012-11-02 | False | A | 151315 | 55.32 | 3.386 | 6766.44 | 5147.7 | 50.82 | 3639.9 | 2737.42 | 223.462779 | 6.573 |
# Pairwise correlation matrix of the training set; show its first five rows
train_corr = train.corr()
train_corr.head()
Store | Dept | Weekly_Sales | IsHoliday | Size | Temperature | Fuel_Price | MarkDown1 | MarkDown2 | MarkDown3 | MarkDown4 | MarkDown5 | CPI | Unemployment | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Store | 1.000000 | 0.024258 | -0.085117 | -0.000522 | -0.182763 | -0.050230 | 0.065321 | -0.119676 | -0.034993 | -0.031475 | -0.009991 | -0.026777 | -0.211261 | 0.208759 |
Dept | 0.024258 | 1.000000 | 0.148749 | 0.000663 | -0.002491 | 0.004727 | 0.003544 | -0.002512 | 0.000018 | 0.001855 | 0.004176 | 0.000295 | -0.007178 | 0.007787 |
Weekly_Sales | -0.085117 | 0.148749 | 1.000000 | 0.012843 | 0.244117 | -0.002339 | 0.000089 | 0.085325 | 0.024565 | 0.060304 | 0.045325 | 0.090561 | -0.021162 | -0.025806 |
IsHoliday | -0.000522 | 0.000663 | 0.012843 | 1.000000 | 0.000797 | -0.155775 | -0.078155 | -0.035632 | 0.334327 | 0.428364 | -0.000459 | -0.053696 | -0.001933 | 0.010555 |
Size | -0.182763 | -0.002491 | 0.244117 | 0.000797 | 1.000000 | -0.058413 | 0.003632 | 0.345732 | 0.108843 | 0.048935 | 0.168266 | 0.304814 | -0.003903 | -0.068335 |
# Pairwise correlation matrix of the test set; show its first five rows
test_corr = test.corr()
test_corr.head()
Store | Dept | IsHoliday | Size | Temperature | Fuel_Price | MarkDown1 | MarkDown2 | MarkDown3 | MarkDown4 | MarkDown5 | CPI | Unemployment | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Store | 1.000000 | 0.019627 | -0.001166 | -0.186845 | -0.043495 | 0.153425 | -0.091707 | -0.041370 | -0.025177 | 0.010331 | 0.010419 | -0.214872 | 0.250321 |
Dept | 0.019627 | 1.000000 | 0.001249 | 0.001502 | 0.003970 | 0.000554 | -0.002353 | 0.001292 | 0.000247 | 0.002510 | 0.000776 | -0.006336 | 0.004087 |
IsHoliday | -0.001166 | 0.001249 | 1.000000 | -0.000443 | -0.187428 | -0.126443 | 0.355257 | 0.265402 | 0.496062 | 0.289700 | -0.019386 | -0.001475 | 0.010288 |
Size | -0.186845 | 0.001502 | -0.000443 | 1.000000 | -0.061256 | 0.055088 | 0.309614 | 0.157526 | 0.050088 | 0.155448 | 0.103681 | -0.002916 | -0.001988 |
Temperature | -0.043495 | 0.003970 | -0.187428 | -0.061256 | 1.000000 | 0.073938 | -0.168899 | -0.324280 | -0.049771 | -0.059583 | 0.003937 | 0.280861 | 0.022136 |
# Heatmap of the training-set correlation matrix (seaborn)
sns.heatmap(data=train.corr())
<matplotlib.axes._subplots.AxesSubplot at 0xdb38b38>
# Heatmap of the test-set correlation matrix (seaborn)
sns.heatmap(data=test.corr())
<matplotlib.axes._subplots.AxesSubplot at 0x1870d4e0>
# Share of training rows contributed by each store (row counts, not sales amounts).
# The figure size must be passed as `figsize`; `fig` expects a Figure object.
train['Store'].value_counts(normalize=True).plot(kind='bar', figsize=(4, 5))
<matplotlib.axes._subplots.AxesSubplot at 0x19c1e400>
# Distribution of the raw Weekly_Sales target
sns.distplot(train['Weekly_Sales'])
<matplotlib.axes._subplots.AxesSubplot at 0x19c1e1d0>
# Line plot with Weekly_Sales on the x axis and Store on the y axis
train.plot.line(x='Weekly_Sales', y='Store', alpha=0.5)
<matplotlib.axes._subplots.AxesSubplot at 0x573d198>
# Horizontal bar plot of Weekly_Sales for each store Type
sns.barplot(x='Weekly_Sales', y='Type', data=train)
<matplotlib.axes._subplots.AxesSubplot at 0x1249be80>
# Weekly_Sales against Dept; the figure size must be passed as `figsize`
# (`fig` expects a Figure object, not a size tuple).
train.plot(kind='line', x='Dept', y='Weekly_Sales', alpha=0.5, figsize=(4, 5))
<matplotlib.axes._subplots.AxesSubplot at 0x175ffd68>
# Missing-value count per column: train first, then test, separated by a rule
separator = "*" * 30
print(train.isnull().sum(), separator, test.isnull().sum(), sep="\n")
Store 0 Dept 0 Date 0 Weekly_Sales 0 IsHoliday 0 Type 0 Size 0 Temperature 0 Fuel_Price 0 MarkDown1 270031 MarkDown2 309308 MarkDown3 283561 MarkDown4 285694 MarkDown5 269283 CPI 0 Unemployment 0 dtype: int64 ****************************** Store 0 Dept 0 Date 0 IsHoliday 0 Type 0 Size 0 Temperature 0 Fuel_Price 0 MarkDown1 149 MarkDown2 28627 MarkDown3 9829 MarkDown4 12888 MarkDown5 0 CPI 38162 Unemployment 38162 dtype: int64
# Fill the missing CPI / Unemployment values in the test set with the mean of
# the same store. Both indicators are repeated across the departments of a
# store (see the displayed test rows), so a per-Dept mean would average values
# from unrelated stores.
# NOTE(review): a store with no observed value at all stays NaN here and is
# then zero-filled by the later fillna(0) - confirm that is acceptable.
for indicator in ('CPI', 'Unemployment'):
    store_mean = test.groupby('Store')[indicator].transform('mean')
    test[indicator] = test[indicator].fillna(store_mean)
Treatment of the remaining missing values: the MarkDown columns are imputed with zero (i.e. no markdown was applied).
# Replace every remaining missing value with 0 in both data sets
train, test = train.fillna(0), test.fillna(0)
# Re-check the per-column missing-value counts after imputation
separator = "*" * 30
print(train.isnull().sum(), separator, test.isnull().sum(), sep="\n")
Store 0 Dept 0 Date 0 Weekly_Sales 0 IsHoliday 0 Type 0 Size 0 Temperature 0 Fuel_Price 0 MarkDown1 0 MarkDown2 0 MarkDown3 0 MarkDown4 0 MarkDown5 0 CPI 0 Unemployment 0 dtype: int64 ****************************** Store 0 Dept 0 Date 0 IsHoliday 0 Type 0 Size 0 Temperature 0 Fuel_Price 0 MarkDown1 0 MarkDown2 0 MarkDown3 0 MarkDown4 0 MarkDown5 0 CPI 0 Unemployment 0 dtype: int64
# Cap Weekly_Sales at 100000, then plot a 25-bin histogram of the capped values
sales_cap = 100000
capped_sales = train['Weekly_Sales'].copy()
capped_sales[capped_sales > sales_cap] = sales_cap
train['Weekly_Sales'] = capped_sales
train['Weekly_Sales'].plot.hist(bins=25)
<matplotlib.axes._subplots.AxesSubplot at 0x176834e0>
In this section, we select the appropriate features to train our regression models. We create new features from the existing ones and convert the categorical features into numeric form.
# Column dtypes, non-null counts and memory usage of the training set
train.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 420212 entries, 0 to 421569 Data columns (total 16 columns): Store 420212 non-null int64 Dept 420212 non-null int64 Date 420212 non-null object Weekly_Sales 420212 non-null float64 IsHoliday 420212 non-null bool Type 420212 non-null object Size 420212 non-null int64 Temperature 420212 non-null float64 Fuel_Price 420212 non-null float64 MarkDown1 420212 non-null float64 MarkDown2 420212 non-null float64 MarkDown3 420212 non-null float64 MarkDown4 420212 non-null float64 MarkDown5 420212 non-null float64 CPI 420212 non-null float64 Unemployment 420212 non-null float64 dtypes: bool(1), float64(10), int64(3), object(2) memory usage: 71.7+ MB
# Parse Date and derive calendar features, identically for train and test.
# Each feature is stored as Date_<attribute> (dayofweek, month, year, day).
for frame in (train, test):
    frame['Date'] = pd.to_datetime(frame['Date'])
    for part in ('dayofweek', 'month', 'year', 'day'):
        frame['Date_' + part] = getattr(frame['Date'].dt, part)
# Row count per store Type: train first, then test
print(train['Type'].value_counts(), "*" * 30, test['Type'].value_counts(), sep="\n")
A 214961 B 162787 C 42464 Name: Type, dtype: int64 ****************************** A 58713 B 44500 C 11851 Name: Type, dtype: int64
# Row count per IsHoliday value: train first, then test
print(train['IsHoliday'].value_counts(), "*" * 30, test['IsHoliday'].value_counts(), sep="\n")
False 390652 True 29560 Name: IsHoliday, dtype: int64 ****************************** False 106136 True 8928 Name: IsHoliday, dtype: int64
# Hold both frames in one list so the same transformation can be applied to
# each in a loop (the frames are not concatenated or merged).
train_test_data = [train, test]
Converting the categorical variable 'Type' into a numerical variable: A=1, B=2, C=3.
# Encode the store Type letters as integers in both data sets
type_mapping = dict(A=1, B=2, C=3)
for dataset in train_test_data:
    store_type = dataset['Type']
    dataset['Type'] = store_type.map(type_mapping)
Converting the boolean variable 'IsHoliday' into a numerical variable: False=0, True=1.
# Encode the boolean IsHoliday flag as 0/1 in both data sets
type_mapping = {flag: int(flag) for flag in (False, True)}
for dataset in train_test_data:
    holiday_flag = dataset['IsHoliday']
    dataset['IsHoliday'] = holiday_flag.map(type_mapping)
Creating extra holiday variables. If a week falls in one of the extra holiday weeks, the variable is 1 (= Yes), otherwise 0 (= No).
# Week dates flagged for each special holiday (2010-2013), in column order.
holiday_weeks = (
    ('Super_Bowl', (datetime(2010, 2, 12), datetime(2011, 2, 11),
                    datetime(2012, 2, 10), datetime(2013, 2, 8))),
    ('Labour_Day', (datetime(2010, 9, 10), datetime(2011, 9, 9),
                    datetime(2012, 9, 7), datetime(2013, 9, 6))),
    ('Thanksgiving', (datetime(2010, 11, 26), datetime(2011, 11, 25),
                      datetime(2012, 11, 23), datetime(2013, 11, 29))),
    ('Christmas', (datetime(2010, 12, 31), datetime(2011, 12, 30),
                   datetime(2012, 12, 28), datetime(2013, 12, 27))),
)


def _flag_holiday_weeks(frame):
    """Add one 0/1 column per special holiday and fold each into IsHoliday.

    `frame` needs a datetime 'Date' column and a 0/1 'IsHoliday' column.
    It is modified in place; nothing is returned.
    """
    for holiday, week_dates in holiday_weeks:
        in_holiday_week = frame['Date'] == week_dates[0]
        for week_date in week_dates[1:]:
            in_holiday_week = in_holiday_week | (frame['Date'] == week_date)
        frame[holiday] = np.where(in_holiday_week, 1, 0)
        # Any special-holiday week also counts as a holiday week.
        frame['IsHoliday'] = frame['IsHoliday'] | frame[holiday]


# Apply the same flags to the train and the test data set.
_flag_holiday_weeks(train)
_flag_holiday_weeks(test)
# Number of training rows inside / outside each special-holiday week
for holiday in ('Christmas', 'Super_Bowl', 'Thanksgiving', 'Labour_Day'):
    print(train[holiday].value_counts())
0 414303 1 5909 Name: Christmas, dtype: int64 0 411339 1 8873 Name: Super_Bowl, dtype: int64 0 414266 1 5946 Name: Thanksgiving, dtype: int64 0 411380 1 8832 Name: Labour_Day, dtype: int64
# Number of test rows inside / outside each special-holiday week
for holiday in ('Christmas', 'Super_Bowl', 'Thanksgiving', 'Labour_Day'):
    print(test[holiday].value_counts())
0 112076 1 2988 Name: Christmas, dtype: int64 0 112100 1 2964 Name: Super_Bowl, dtype: int64 0 112088 1 2976 Name: Thanksgiving, dtype: int64 0 115064 Name: Labour_Day, dtype: int64
# IsHoliday has already been OR-ed with the four special-holiday indicators,
# so the individual indicator columns are redundant and are removed in place.
dp = ['Super_Bowl', 'Labour_Day', 'Thanksgiving', 'Christmas']
for frame in (train, test):
    frame.drop(dp, axis=1, inplace=True)
train.head(2)
Store | Dept | Date | Weekly_Sales | IsHoliday | Type | Size | Temperature | Fuel_Price | MarkDown1 | MarkDown2 | MarkDown3 | MarkDown4 | MarkDown5 | CPI | Unemployment | Date_dayofweek | Date_month | Date_year | Date_day | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 2010-02-05 | 24924.50 | 0 | 1 | 151315 | 42.31 | 2.572 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 211.096358 | 8.106 | 4 | 2 | 2010 | 5 |
1 | 1 | 2 | 2010-02-05 | 50605.27 | 0 | 1 | 151315 | 42.31 | 2.572 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 211.096358 | 8.106 | 4 | 2 | 2010 | 5 |
- Since the missing MarkDown values have been imputed, we do not remove all of the MarkDown variables.
- MarkDown5 is removed because it is highly skewed; Unemployment and CPI are dropped as well.
# Remove the columns that are not used as model features from both data sets
features_drop = ['Unemployment', 'CPI', 'MarkDown5']
train, test = (frame.drop(features_drop, axis=1) for frame in (train, test))
train.head(2)
Store | Dept | Date | Weekly_Sales | IsHoliday | Type | Size | Temperature | Fuel_Price | MarkDown1 | MarkDown2 | MarkDown3 | MarkDown4 | Date_dayofweek | Date_month | Date_year | Date_day | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 2010-02-05 | 24924.50 | 0 | 1 | 151315 | 42.31 | 2.572 | 0.0 | 0.0 | 0.0 | 0.0 | 4 | 2 | 2010 | 5 |
1 | 1 | 2 | 2010-02-05 | 50605.27 | 0 | 1 | 151315 | 42.31 | 2.572 | 0.0 | 0.0 | 0.0 | 0.0 | 4 | 2 | 2010 | 5 |
# First two rows of the test set after the feature drops above
test.head(2)
Store | Dept | Date | IsHoliday | Type | Size | Temperature | Fuel_Price | MarkDown1 | MarkDown2 | MarkDown3 | MarkDown4 | Date_dayofweek | Date_month | Date_year | Date_day | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 2012-11-02 | 0 | 1 | 151315 | 55.32 | 3.386 | 6766.44 | 5147.7 | 50.82 | 3639.9 | 4 | 11 | 2012 | 2 |
1 | 1 | 2 | 2012-11-02 | 0 | 1 | 151315 | 55.32 | 3.386 | 6766.44 | 5147.7 | 50.82 | 3639.9 | 4 | 11 | 2012 | 2 |
# Cast every float column of both data sets to int.
# NOTE(review): astype(int) drops the fractional part, so this also coarsens
# Weekly_Sales, Temperature and Fuel_Price - confirm this loss is intended.
for frame in (train, test):
    float_columns = [name for name in frame.columns if frame[name].dtype == float]
    for name in float_columns:
        frame[name] = frame[name].astype(int)
import seaborn as sns
# Distribution of Weekly_Sales after capping and the integer cast
sns.distplot(train['Weekly_Sales'])
<matplotlib.axes._subplots.AxesSubplot at 0xc2e0f98>
As the figure above shows, the target y (Weekly_Sales) is not normally distributed, so we take the log of y (log(y + 1)).
# Replace the target by log(1 + Weekly_Sales) and re-plot its distribution
shifted_sales = 1 + train['Weekly_Sales']
train['Weekly_Sales'] = np.log(shifted_sales)
sns.distplot(train['Weekly_Sales'])
<matplotlib.axes._subplots.AxesSubplot at 0xc3a8588>
# Normal probability (Q-Q) plot of the log-transformed Weekly_Sales target.
# TODO: a Box-Cox transformation could be tried and plotted here as well.
from scipy import stats
import pylab
log_sales = train['Weekly_Sales']
stats.probplot(log_sales, dist="norm", plot=pylab)
pylab.show()
# Target: the (log-transformed) Weekly_Sales column only
train_y = train['Weekly_Sales']
# Training features: everything except the target and the raw Date column
train_X = train.drop(['Weekly_Sales', 'Date'], axis=1)
# Test features: the test frame without the raw Date column
test_X = test.drop('Date', axis=1).copy()
train_X.shape, train_y.shape, test_X.shape
((420212, 15), (420212,), (115064, 15))
## Method 1: linear regression
clf = LinearRegression()
clf.fit(train_X, train_y)
y_pred_linear = clf.predict(test_X)
# score() is evaluated on the same data the model was fitted on (no held-out
# set), expressed here as a percentage.
acc_linear = round(clf.score(train_X, train_y) * 100, 2)
print('score:' + str(acc_linear) + ' percent')
scorbe:11.03 percent
import statsmodels.api as sm
# OLS fit on the same features to get a full coefficient summary.
# The model is fitted on train_x (the add_constant result) so that the
# intercept column is actually used instead of being computed and discarded.
# NOTE(review): add_constant skips adding a column when the matrix already
# contains a constant one; Date_dayofweek looks constant in the displayed
# rows, so the fitted coefficients may be unchanged - confirm.
train_x = sm.add_constant(train_X)
lm = sm.OLS(train_y, train_x).fit()
print(lm.summary())
OLS Regression Results ============================================================================== Dep. Variable: Weekly_Sales R-squared: 0.110 Model: OLS Adj. R-squared: 0.110 Method: Least Squares F-statistic: 3720. Date: Tue, 03 Sep 2019 Prob (F-statistic): 0.00 Time: 21:56:39 Log-Likelihood: -8.6782e+05 No. Observations: 420212 AIC: 1.736e+06 Df Residuals: 420197 BIC: 1.736e+06 Df Model: 14 Covariance Type: nonrobust ================================================================================== coef std err t P>|t| [0.025 0.975] ---------------------------------------------------------------------------------- Store -0.0134 0.000 -56.412 0.000 -0.014 -0.013 Dept 0.0016 9.65e-05 16.264 0.000 0.001 0.002 IsHoliday -0.0520 0.012 -4.169 0.000 -0.076 -0.028 Type 0.1135 0.008 14.836 0.000 0.099 0.129 Size 1.084e-05 8.38e-08 129.421 0.000 1.07e-05 1.1e-05 Temperature -0.0036 0.000 -21.059 0.000 -0.004 -0.003 Fuel_Price 0.0329 0.008 4.130 0.000 0.017 0.049 MarkDown1 1.071e-05 1.02e-06 10.539 0.000 8.72e-06 1.27e-05 MarkDown2 -8.538e-07 6.17e-07 -1.384 0.166 -2.06e-06 3.56e-07 MarkDown3 3.695e-06 5.59e-07 6.613 0.000 2.6e-06 4.79e-06 MarkDown4 -6.834e-06 1.43e-06 -4.794 0.000 -9.63e-06 -4.04e-06 Date_dayofweek 35.7844 3.199 11.187 0.000 29.515 42.054 Date_month 0.0160 0.001 16.088 0.000 0.014 0.018 Date_year -0.0676 0.006 -10.621 0.000 -0.080 -0.055 Date_day -0.0004 0.000 -1.049 0.294 -0.001 0.000 ============================================================================== Omnibus: 79726.360 Durbin-Watson: 1.429 Prob(Omnibus): 0.000 Jarque-Bera (JB): 156534.468 Skew: -1.156 Prob(JB): 0.00 Kurtosis: 4.896 Cond. No. 1.63e+08 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 1.63e+08. This might indicate that there are strong multicollinearity or other numerical problems.
# Random forest regressor with 100 trees.
# The reported score is computed on the training data itself, not on a
# held-out set.
clf = RandomForestRegressor(n_estimators=100)
clf.fit(train_X, train_y)
y_pred_rf = clf.predict(test_X)
training_score = clf.score(train_X, train_y)
acc_rf = round(training_score * 100, 2)
print("Accuracy: %i %% \n" % acc_rf)
Accuracy: 99 %
# Decision tree regressor with default settings.
# The reported score is computed on the training data itself, not on a
# held-out set.
clf = DecisionTreeRegressor()
clf.fit(train_X, train_y)
y_pred_dt = clf.predict(test_X)
training_score = clf.score(train_X, train_y)
acc_dt = round(training_score * 100, 2)
print(str(acc_dt), 'percent')
100.0 percent
# Training-set score of each model, best first
score_table = [
    ('Linear Regression', acc_linear),
    ('Random Forest', acc_rf),
    ('Decision Tree', acc_dt),
]
models = pd.DataFrame(score_table, columns=['Model', 'Score'])
models.sort_values(by='Score', ascending=False)
Model | Score | |
---|---|---|
2 | Decision Tree | 100.00 |
1 | Random Forest | 99.63 |
0 | Linear Regression | 11.03 |
# Predicted weekly sales from the random forest model, one row per
# Store_Dept_Date key.
# The models were fitted on log(Weekly_Sales + 1), so the predictions are
# mapped back to the original sales scale with expm1 before being written out.
submission = pd.DataFrame({
    "Store_Dept_Date": test.Store.astype(str) + '_' + test.Dept.astype(str) + '_' + test.Date.astype(str),
    "Weekly_Sales": np.expm1(y_pred_rf)
})
submission.to_csv('weekly_sales predicted.csv', index=False)
submission.head()
Store_Dept_Date | Weekly_Sales | |
---|---|---|
0 | 1_1_2012-11-02 | 10.268372 |
1 | 1_2_2012-11-02 | 10.766197 |
2 | 1_3_2012-11-02 | 9.306946 |
3 | 1_4_2012-11-02 | 10.562144 |
4 | 1_5_2012-11-02 | 10.341259 |
##########################End##########